/*
 * Copyright (C) 2008, 2011 The Android Open Source Project
 * Copyright (C) 2010 ST-Ericsson SA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

/*
 * Optimized memcmp() for ARM9 and Cortex-A9
 */
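
/*
 * For reference, a minimal C sketch of the semantics implemented here
 * (illustrative only; memcmp_ref is not part of this file):
 *
 *     #include <stddef.h>
 *
 *     int memcmp_ref(const void *a, const void *b, size_t n) {
 *         const unsigned char *p = a, *q = b;
 *         while (n--) {
 *             int d = *p++ - *q++;    // sign of first differing pair
 *             if (d != 0)
 *                 return d;
 *         }
 *         return 0;
 *     }
 *
 * The assembly below speeds this up by aligning the first pointer and
 * comparing 32 bytes per iteration when both pointers share the same
 * word alignment, prefetching one to two cache lines ahead.
 */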

#if __ARM_ARCH__ >= 7
#define __ARM_CORTEX

#if defined(CORTEX_CACHE_LINE_32)
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif

#endif /* __ARM_ARCH__ */

ENTRY(memcmp)
#if defined(__ARM_CORTEX)
        pld         [r0, #(CACHE_LINE_SIZE * 0)]
        pld         [r0, #(CACHE_LINE_SIZE * 1)]
#else
        PLD         (r0, #0)
        PLD         (r1, #0)
#endif

        /* take care of the case where the length is 0 or the buffers are the same */
        cmp         r0, r1
#if !defined(__ARM_CORTEX)
        cmpne       r2, #0
#endif
        moveq       r0, #0
        bxeq        lr

#if defined(__ARM_CORTEX)
        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes; this simplifies things
         * below and avoids some overhead for small blocks
         */
        cmp        r2, #(8+4)
        bmi        10f
#endif /* __ARM_CORTEX */

        .save {r4, lr}
        /* save registers */
        stmfd       sp!, {r4, lr}

#if !defined(__ARM_CORTEX)
        PLD         (r0, #32)
        PLD         (r1, #32)
#endif

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

#if !defined(__ARM_CORTEX)
        /* make sure we have at least 8+4 bytes; this simplifies things
         * below and avoids some overhead for small blocks
         */
        cmp         r2, #(8+4)
        bmi         8f
#endif

        /* align first pointer to word boundary
         * offset = -src & 3
         */
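        /* worked example (illustrative): if r4 = 0x1001 then
         * r3 = (-0x1001) & 3 = 3, so three bytes are compared one at a
         * time below, leaving r4 word-aligned at 0x1004.
         */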
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align the first pointer */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent (same offset within a word) */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */
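
        /* a rough C sketch of one unrolled step (illustrative only;
         * 'ip' already holds the current word of the second buffer,
         * and each load through r1 fetches one word ahead):
         *
         *     uint32_t w    = *p4++;      // ldr r0, [r4], #4
         *     uint32_t next = *++p1;      // ldr lr, [r1, #4]!  (read ahead)
         *     if (w != ip) goto differ;   // eors r0, r0, ip
         *     ip = next;                  // ip and lr alternate roles
         *
         * eight such steps are chained with conditional execution so a
         * mismatch exits the chain without intermediate branches.
         */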

        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f
        
0:
#if defined(__ARM_CORTEX)
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
#else
        PLD         (r4, #64)
        PLD         (r1, #64)
#endif
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f        
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f
        
        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes differ; back up and redo them byte by byte */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall: subs consumes ip right after its load
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

#if defined(__ARM_CORTEX)
10:     /* process fewer than 12 (8+4) bytes */
        cmp         r2, #0
        moveq       r0, #0
        bxeq        lr
        mov         r3, r0
11:
        ldrb        r0, [r3], #1
        ldrb        ip, [r1], #1
        subs        r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         11b
        bx          lr
#endif /* __ARM_CORTEX */

5:      /*************** non-congruent case ***************/
        and         r0, r1, #3          /* r0 = offset of r1 within a word */
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bit aligned, special-cased) */
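
        /* illustrative sketch (little-endian): with r1 two bytes past
         * a word boundary, each unaligned word is rebuilt from two
         * aligned loads as
         *
         *     ip = (cur >> 16) | (next << 16);
         *
         * where 'cur' is the current aligned word and 'next' the one
         * after it.
         */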
        
        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

6:
#if defined(__ARM_CORTEX)
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
#else
        PLD         (r1, #64)
        PLD         (r4, #64)
#endif
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        sub         r1, r1, #2          /* r1 ran 2 bytes ahead (align + read-ahead) */
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and redo the last word bytewise */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        // r5 = right shift amount, in bits
        // r6 = left shift amount, in bits
        // r7 = scratch

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */
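
        /* worked example (illustrative, little-endian): offset 1 gives
         * r5 = 8 and r6 = 24, so each unaligned word is rebuilt as
         *
         *     ip = (cur >> 8) | (next << 24);
         *
         * from two consecutive aligned loads 'cur' and 'next'.
         */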

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        /* r1 has run (4 - offset) bytes ahead of the data consumed
         * (alignment round-down plus one word of read-ahead); pull it back */
        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and redo the last word bytewise */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
END(memcmp)